In [19]:
import os

import pandas as pd
import numpy as np

import seaborn as sns; sns.set()

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix

from statsmodels.discrete.discrete_model import Logit, LogitResults

In [5]:
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
pd.set_option('display.float_format', '{:.2f}'.format)

In [7]:
df = pd.read_pickle('../Chapter 7 - Data Preparation and Visualization/claims_df')

In [32]:
df.shape


Out[32]:
(4700, 38)

In [9]:
disease = ['SP_ALZHDMTA', 'SP_CHF', 'SP_CHRNKIDN', 'SP_CNCR', 'SP_COPD',
           'SP_DEPRESSN', 'SP_DIABETES', 'SP_ISCHMCHT', 'SP_OSTEOPRS',
           'SP_RA_OA', 'SP_STRKETIA']
gender = ['gender_2']
ESRD = ['ESRD_Y']

In [10]:
X = df[disease+gender+ESRD]
y = df.HIGH_COST

In [11]:
lab_enc = LabelEncoder()

In [12]:
# fit_transform returns the encoded labels as an array; assign it back
# (y = lab_enc.fit_transform(y)) if y still needs integer encoding
lab_enc.fit_transform(y)


Out[12]:
array([0, 0, 0, ..., 0, 0, 0])

In [110]:
lm = LogisticRegression()

In [111]:
lm.fit(X, y)


Out[111]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [112]:
lm.score(X, y)


Out[112]:
0.9027659574468085
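HIGH_COST is a rare outcome here (about 10% of members, as the confusion matrix below shows), so raw accuracy is a weak yardstick: a model that always predicts the majority class scores close to 0.90 on its own. A quick check of the class balance (a sketch, reusing y from above):

y.value_counts(normalize=True)  # the majority share is the accuracy of a trivial always-negative model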

In [113]:
def generate_conf_mat(model, X, y):
    # sklearn's convention is confusion_matrix(y_true, y_pred), so the actual
    # labels come first; rows are then actuals and columns are predictions
    conf_mat = confusion_matrix(y, model.predict(X))
    # Normalize by the grand total so each cell is a proportion of the sample
    conf_mat = pd.DataFrame(conf_mat / conf_mat.sum())
    conf_mat.columns = ['0_pred', '1_pred']
    conf_mat.index = ['0_actual', '1_actual']
    return conf_mat

In [114]:
conf_mat = generate_conf_mat(lm, X, y)

In [115]:
conf_mat


Out[115]:
         0_pred  1_pred
0_actual   0.88    0.02
1_actual   0.08    0.02
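Reading the matrix row-wise: of the roughly 10% of members who are actually high cost, the model flags only 0.02 of the sample correctly and misses 0.08. Recall makes this explicit (a sketch, assuming HIGH_COST is coded 0/1 as the labels suggest):

from sklearn.metrics import recall_score
recall_score(y, lm.predict(X))  # fraction of actual high-cost members the model flags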

Use statsmodels to inspect the coefficient estimates, standard errors, and p-values


In [116]:
lm2 = Logit(y, X)

In [117]:
results = lm2.fit()


Optimization terminated successfully.
         Current function value: 0.421470
         Iterations 7
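One caveat: unlike scikit-learn's LogisticRegression, statsmodels' Logit does not add an intercept automatically, so the model above is fit without a constant term; that is why the pseudo R-squared in the summary below comes out negative (the fit is worse than an intercept-only null model). To include an intercept, add a constant column to the design matrix (a sketch; the coefficient estimates would change):

import statsmodels.api as sm
results_const = Logit(y, sm.add_constant(X)).fit()  # same model, now with an intercept term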

In [118]:
results.summary()


Out[118]:
                        Logit Regression Results
==============================================================================
Dep. Variable:              HIGH_COST   No. Observations:                4700
Model:                          Logit   Df Residuals:                    4687
Method:                           MLE   Df Model:                          12
Date:               Thu, 21 Jun 2018   Pseudo R-squ.:                 -0.2946
Time:                        11:27:17   Log-Likelihood:                -1980.9
converged:                       True   LL-Null:                       -1530.1
                                        LLR p-value:                     1.000
===============================================================================
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
SP_ALZHDMTA    -0.0405      0.101     -0.401      0.688      -0.238       0.157
SP_CHF         -0.2308      0.097     -2.370      0.018      -0.422      -0.040
SP_CHRNKIDN     1.4570      0.121     12.066      0.000       1.220       1.694
SP_CNCR         0.1308      0.148      0.882      0.378      -0.160       0.421
SP_COPD         0.9319      0.116      8.040      0.000       0.705       1.159
SP_DEPRESSN    -0.3392      0.098     -3.471      0.001      -0.531      -0.148
SP_DIABETES    -0.8365      0.098     -8.507      0.000      -1.029      -0.644
SP_ISCHMCHT    -1.1603      0.093    -12.432      0.000      -1.343      -0.977
SP_OSTEOPRS    -0.2756      0.105     -2.635      0.008      -0.481      -0.071
SP_RA_OA       -0.0401      0.112     -0.357      0.721      -0.260       0.180
SP_STRKETIA     1.0162      0.166      6.119      0.000       0.691       1.342
gender_2       -1.7737      0.075    -23.672      0.000      -1.921      -1.627
ESRD_Y          0.4462      0.137      3.265      0.001       0.178       0.714
===============================================================================
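The coefficients are on the log-odds scale; exponentiating them gives odds ratios, which are often easier to interpret clinically (chronic kidney disease, SP_CHRNKIDN, multiplies the odds of being high cost by about e^1.457 ≈ 4.3). A quick sketch using the fitted results object:

odds_ratios = np.exp(results.params)  # params is a Series indexed by the columns of X
odds_ratios.sort_values(ascending=False)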

Approach using an SVM (LinearSVC with balanced class weights)


In [119]:
from sklearn.svm import LinearSVC

In [120]:
# 'balanced' reweights classes inversely to their frequency,
# boosting the rare HIGH_COST class during training
mod = LinearSVC(class_weight='balanced')

In [121]:
mod.fit(X, y)


Out[121]:
LinearSVC(C=1.0, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [122]:
conf_mat = generate_conf_mat(mod, X, y)

In [123]:
conf_mat


Out[123]:
         0_pred  1_pred
0_actual   0.73    0.17
1_actual   0.02    0.08
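Relative to the logistic regression, the class-balanced SVM recovers far more of the actual high-cost members (0.08 of the sample versus 0.02) at the cost of more false positives (0.17 versus 0.02). A quick side-by-side comparison of recall (a sketch reusing both fitted models):

from sklearn.metrics import recall_score
for name, model in [('logistic', lm), ('linear_svc', mod)]:
    print(name, recall_score(y, model.predict(X)))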
